Script to visualize the results from
kappa_loss_analysis.R.
Data
# Load data
results <- read.csv("results/threshold_results.csv", stringsAsFactors = FALSE)
klc_data <- readRDS("results/KLC_plot_deciles.rds")
# Clean up data
results$critical_percentage <- as.numeric(as.character(results$critical_percentage))
results$kappa_at_critical <- as.numeric(as.character(results$kappa_at_critical))
klc_data <- klc_data %>%
filter(noise %in% c(10, 20, 30)) %>%
select(-c(accuracy, kappa, dataset_order, method_order))
Summary Statistics
# Display the raw data from CSV
kable(head(results, 20), caption = "Raw Data from threshold_results.csv (First 20 Rows)")
Raw Data from threshold_results.csv (First 20 Rows)
| analcatdata_authorship |
C5.0 |
10 |
0.05 |
100 |
0.00 |
| badges2 |
C5.0 |
10 |
0.05 |
90 |
0.05 |
| banknote |
C5.0 |
10 |
0.05 |
100 |
0.00 |
| blood-transfusion-service-center |
C5.0 |
10 |
0.05 |
70 |
0.05 |
| breast-w |
C5.0 |
10 |
0.05 |
100 |
0.01 |
| cardiotocography |
C5.0 |
10 |
0.05 |
10 |
0.05 |
| climate-model-simulation-crashes |
C5.0 |
10 |
0.05 |
100 |
0.05 |
| cmc |
C5.0 |
10 |
0.05 |
40 |
0.05 |
| credit-g |
C5.0 |
10 |
0.05 |
100 |
0.04 |
| diabetes |
C5.0 |
10 |
0.05 |
100 |
0.03 |
| eucalyptus |
C5.0 |
10 |
0.05 |
70 |
0.04 |
| iris |
C5.0 |
10 |
0.05 |
100 |
0.02 |
| kc1 |
C5.0 |
10 |
0.05 |
70 |
0.05 |
| liver-disorders |
C5.0 |
10 |
0.05 |
20 |
0.05 |
| mfeat-factors |
C5.0 |
10 |
0.05 |
100 |
0.00 |
| mfeat-karhunen |
C5.0 |
10 |
0.05 |
100 |
0.01 |
| mfeat-zernike |
C5.0 |
10 |
0.05 |
100 |
0.02 |
| ozone-level-8hr |
C5.0 |
10 |
0.05 |
100 |
0.01 |
| pc4 |
C5.0 |
10 |
0.05 |
30 |
0.04 |
| phoneme |
C5.0 |
10 |
0.05 |
70 |
0.05 |
# Create interactive table of the full dataset
datatable(results,
options = list(
pageLength = 10,
scrollX = TRUE,
dom = 'Bfrtip',
buttons = c('copy', 'csv', 'excel')
),
extensions = 'Buttons',
caption = "Complete Raw Data from threshold_results.csv")
# Summary statistics by threshold level
threshold_summary <- results %>%
group_by(threshold) %>%
summarise(
mean_critical = mean(critical_percentage, na.rm = TRUE),
median_critical = median(critical_percentage, na.rm = TRUE),
)
kable(threshold_summary, caption = "Summary of Critical Percentages by Threshold Level")
Summary of Critical Percentages by Threshold Level
| 0.05 |
68.76679 |
80 |
| 0.10 |
84.16361 |
100 |
| 0.15 |
90.68987 |
100 |
# Summary by technique
technique_summary <- results %>%
group_by(technique) %>%
summarise(
mean_critical = mean(critical_percentage, na.rm = TRUE),
median_critical = median(critical_percentage, na.rm = TRUE),
) %>%
arrange(median_critical)
kable(technique_summary, caption = "Summary of Critical Percentages by ML Technique")
Summary of Critical Percentages by ML Technique
| rbfDDA |
60.81197 |
80 |
| C5.0 |
79.10256 |
100 |
| JRip |
81.79487 |
100 |
| PART |
80.17094 |
100 |
| bayesglm |
81.53846 |
100 |
| ctree |
81.45299 |
100 |
| fda |
76.96581 |
100 |
| gbm |
84.23077 |
100 |
| gcvEarth |
75.98291 |
100 |
| knn |
88.07692 |
100 |
| lvq |
80.89744 |
100 |
| mlpML |
94.61538 |
100 |
| multinom |
71.83761 |
100 |
| naive_bayes |
85.64103 |
100 |
| rda |
73.50427 |
100 |
| rf |
84.10256 |
100 |
| rfRules |
87.35043 |
100 |
| rpart |
80.59829 |
100 |
| simpls |
86.28205 |
100 |
| svmLinear |
84.52991 |
100 |
| svmRadial |
85.85470 |
100 |
Visualization of Critical Percentages
Kappa Loss Curves Visualization
# Get unique dataset names and technique names
datasets <- unique(klc_data$dataset_name)
method_names <- unique(klc_data$technique)
# Create a new column to control the order of datasets
klc_data$dataset_order <- factor(klc_data$dataset_name, levels = datasets)
# Create a new column to control the order of methods
klc_data$method_order <- factor(klc_data$technique, levels = method_names)
# Create custom labels for methods (a-u) and datasets (1-26)
method_labels <- letters[1:length(method_names)]
names(method_labels) <- method_names
dataset_labels <- as.character(1:length(datasets))
names(dataset_labels) <- datasets
# Create plot
p <- ggplot(klc_data, aes(x = percentage, y = kappa_loss, color = factor(noise))) +
geom_point(size = 0.8, alpha = 0.6) +
geom_line(aes(group = factor(noise)), linewidth = 0.8) +
labs(title = "Kappa Loss Curves by Dataset, Technique, and Noise Level",
x = "Percentage of Instances",
y = "Kappa Loss",
color = "Noise Level") +
theme_bw() +
scale_y_continuous(limits = c(0.0, 1), breaks = seq(0, 1, by = 0.2)) +
facet_grid(method_order ~ dataset_order, scales = "free",
labeller = labeller(method_order = method_labels, dataset_order = dataset_labels)) +
theme(strip.text = element_text(size = 7),
axis.text = element_text(size = 6),
legend.position = "bottom")
# Print plot
print(p)

Distribution of Critical Percentages
ggplot(results %>% filter(!is.na(critical_percentage)),
aes(x = critical_percentage, fill = as.factor(threshold))) +
geom_density(alpha = 0.7) +
labs(title = "Distribution of Critical Percentages by Threshold Level",
x = "Critical Percentage of Instances",
y = "Density",
fill = "Threshold") +
theme_minimal() +
scale_fill_manual(values = c("0.05" = "#94D2BD", "0.1" = "#0A9396", "0.15" = "#005F73")) +
theme(legend.position = "bottom")

Kappa Loss Analysis
Average Kappa Loss by Threshold
# Calculate average kappa loss by threshold
kappa_loss_by_threshold <- results %>%
filter(!is.na(kappa_at_critical)) %>%
group_by(threshold) %>%
summarise(
mean_kappa_loss = mean(kappa_at_critical, na.rm = TRUE),
median_kappa_loss = median(kappa_at_critical, na.rm = TRUE),
)
# Display summary table
kable(kappa_loss_by_threshold,
caption = "Summary of Kappa Loss by Threshold Level",
digits = 3)
Summary of Kappa Loss by Threshold Level
| 0.05 |
0.029 |
0.04 |
| 0.10 |
0.051 |
0.05 |
| 0.15 |
0.064 |
0.06 |
Kappa Loss by Machine Learning Technique and Threshold
# Calculate average kappa loss by technique and threshold
kappa_loss_by_technique <- results %>%
filter(!is.na(kappa_at_critical)) %>%
group_by(technique, threshold) %>%
summarise(
mean_kappa_loss = mean(kappa_at_critical, na.rm = TRUE),
n_values = sum(!is.na(kappa_at_critical))
) %>%
ungroup()
# Create a heatmap of kappa loss by technique and threshold
ggplot(kappa_loss_by_technique, aes(x = as.factor(threshold), y = reorder(technique, -mean_kappa_loss), fill = mean_kappa_loss)) +
geom_tile() +
scale_fill_viridis_c() +
labs(title = "Average Kappa Loss by ML Technique and Threshold",
x = "Threshold",
y = "Machine Learning Technique",
fill = "Mean Kappa Loss") +
theme_minimal() +
theme(axis.text.y = element_text(size = 8)) +
geom_text(aes(label = sprintf("%.2f", mean_kappa_loss)), size = 3)
